# load packages, installing if missing
if (!require(librarian)){
install.packages("librarian")
library(librarian)
}
librarian::shelf(
dismo, dplyr, DT, ggplot2, here, htmltools, leaflet, mapview, purrr, raster, readr, rgbif, rgdal, rJava, sdmpredictors, sf, spocc, tidyr, GGally, caret, pdp, ranger, rpart, rpart.plot, rsample, skimr, vip, maptools, usdm)
select <- dplyr::select # overwrite raster::select
options(readr.show_col_types = FALSE,
scipen = 999)
# set random seed for reproducibility
set.seed(42)
# directory to store data
dir_data <- here("data/sdm")
env_stack_grd <- file.path(dir_data, "env_stack.grd")
mdl_maxv_rds <- file.path(dir_data, "mdl_maxent_vif.rds")
dir.create(dir_data, showWarnings = F)
# read raster stack of environment
#env_stack <- raster::stack(env_stack_grd)
The Species that I will be looking at is the Glaucomys sabrinus, or the northern flying squirrel.
url <- "https://cff2.earth.com/uploads/2017/01/03144442/Glaucomys-sabrinus-coloratus-750x400.jpg"
knitr::include_graphics(url)
obs_csv <- file.path(dir_data, "obs.csv")
obs_geo <- file.path(dir_data, "obs.geojson")
redo <- FALSE
if (!file.exists(obs_geo) | redo){
# get species occurrence data from GBIF with coordinates
(res <- spocc::occ(
query = 'Glaucomys sabrinus',
from = 'gbif', has_coords = T,
limit = 10000))
# extract data frame from result
df <- res$gbif$data[[1]]
readr::write_csv(df, obs_csv)
# convert to points of observation from lon/lat columns in data frame
obs <- df %>%
filter(latitude > 26, longitude > -160) %>%
sf::st_as_sf(
coords = c("longitude", "latitude"),
crs = st_crs(4326)) %>%
select(prov, key) # save space (joinable from obs_csv)
sf::write_sf(obs, obs_geo, delete_dsn=T)
}
obs <- sf::read_sf(obs_geo)
nrow(obs) # number of rows
## [1] 3792
# show points on map
mapview::mapview(obs, map.types = "OpenTopoMap")
Question 1. How many observations total are in GBIF for your species? (Hint: ?occ)
As we can see in the df dataframe there are 3797 observations total in GBIF for my species.
Question 2. Do you see any odd observations, like marine species on land or vice versa? If so, please see the Data Cleaning and explain what you did to fix or remove these points.
Yes, there are some strange observations that are in South America and Africa, when this species should only be seen in North America. So I filtered the dataset by lon and lat so that only oberservations in North America will show up. This leaves us with 3792 observations.
dir_env <- file.path(dir_data, "env")
# set a default data directory
options(sdmpredictors_datadir = dir_env)
# choosing terrestrial
env_datasets <- sdmpredictors::list_datasets(terrestrial = TRUE, marine = FALSE)
# show table of datasets
env_datasets %>%
select(dataset_code, description, citation) %>%
DT::datatable()
# choose datasets for a vector
env_datasets_vec <- c("WorldClim", "ENVIREM")
# get layers
env_layers <- sdmpredictors::list_layers(env_datasets_vec)
DT::datatable(env_layers)
# choose layers after some inspection and perhaps consulting literature
env_layers_vec <- c("WC_alt", "WC_bio1", "WC_bio2", "ER_tri", "ER_topoWet")
# get layers
env_stack <- load_layers(env_layers_vec)
# interactive plot layers, hiding all but first (select others)
# mapview(env_stack, hide = T) # makes the html too big for Github
plot(env_stack, nc=2)
obs_hull_geo <- file.path(dir_data, "obs_hull.geojson")
env_stack_grd <- file.path(dir_data, "env_stack.grd")
if (!file.exists(obs_hull_geo) | redo){
# make convex hull around points of observation
obs_hull <- sf::st_convex_hull(st_union(obs))
# save obs hull
write_sf(obs_hull, obs_hull_geo)
}
obs_hull <- read_sf(obs_hull_geo)
# show points on map
mapview(
list(obs, obs_hull))
if (!file.exists(env_stack_grd) | redo){
obs_hull_sp <- sf::as_Spatial(obs_hull)
env_stack <- raster::mask(env_stack, obs_hull_sp) %>%
raster::crop(extent(obs_hull_sp))
writeRaster(env_stack, env_stack_grd, overwrite=T)
}
env_stack <- stack(env_stack_grd)
# show map
# mapview(obs) +
# mapview(env_stack, hide = T) # makes html too big for Github
plot(env_stack, nc=2)
###### Psuedo-Absense Points
absence_geo <- file.path(dir_data, "absence.geojson")
pts_geo <- file.path(dir_data, "pts.geojson")
pts_env_csv <- file.path(dir_data, "pts_env.csv")
if (!file.exists(absence_geo) | redo){
# get raster count of observations
r_obs <- rasterize(
sf::as_Spatial(obs), env_stack[[1]], field=1, fun='count')
# show map
# mapview(obs) +
# mapview(r_obs)
# create mask for
r_mask <- mask(env_stack[[1]] > -Inf, r_obs, inverse=T)
# generate random points inside mask
absence <- dismo::randomPoints(r_mask, nrow(obs)) %>%
as_tibble() %>%
st_as_sf(coords = c("x", "y"), crs = 4326)
write_sf(absence, absence_geo, delete_dsn=T)
}
absence <- read_sf(absence_geo)
# show map of presence, ie obs, and absence
mapview(obs, col.regions = "green") +
mapview(absence, col.regions = "gray")
if (!file.exists(pts_env_csv) | redo){
# combine presence and absence into single set of labeled points
pts <- rbind(
obs %>%
mutate(
present = 1) %>%
select(present, key),
absence %>%
mutate(
present = 0,
key = NA)) %>%
mutate(
ID = 1:n()) %>%
relocate(ID)
write_sf(pts, pts_geo, delete_dsn=T)
# extract raster values for points
pts_env <- raster::extract(env_stack, as_Spatial(pts), df=TRUE) %>%
tibble() %>%
# join present and geometry columns to raster value results for points
left_join(
pts %>%
select(ID, present),
by = "ID") %>%
relocate(present, .after = ID) %>%
# extract lon, lat as single columns
mutate(
#present = factor(present),
lon = st_coordinates(geometry)[,1],
lat = st_coordinates(geometry)[,2]) %>%
select(-geometry)
write_csv(pts_env, pts_env_csv)
}
pts_env <- read_csv(pts_env_csv)
pts_env %>%
# show first 10 presence, last 10 absence
slice(c(1:10, (nrow(pts_env)-9):nrow(pts_env))) %>%
DT::datatable(
rownames = F,
options = list(
dom = "t",
pageLength = 20))
pts_env %>%
select(-ID) %>%
mutate(
present = factor(present)) %>%
pivot_longer(-present) %>%
ggplot() +
geom_density(aes(x = value, fill = present)) +
scale_fill_manual(values = alpha(c("gray", "green"), 0.5)) +
scale_x_continuous(expand=c(0,0)) +
scale_y_continuous(expand=c(0,0)) +
theme_bw() +
facet_wrap(~name, scales = "free") +
theme(
legend.position = c(1, 0),
legend.justification = c(1, 0))
# Look at the data in a dataframe.
datatable(pts_env, rownames = F)
# Now look at it in a pairs plot.
GGally::ggpairs(
select(pts_env, -ID),
aes(color = factor(present)))
# setup model data
d <- pts_env %>%
select(-ID) %>% # remove terms we don't want to model
tidyr::drop_na() # drop the rows with NA values
nrow(d)
## [1] 7551
# fit a linear model
mdl <- lm(present ~ ., data = d)
summary(mdl)
##
## Call:
## lm(formula = present ~ ., data = d)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.2046 -0.3802 -0.0042 0.4029 1.0084
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 4.74328211 0.18302368 25.916 < 0.0000000000000002 ***
## WC_alt -0.00025023 0.00002043 -12.249 < 0.0000000000000002 ***
## WC_bio1 -0.03670053 0.00367575 -9.985 < 0.0000000000000002 ***
## WC_bio2 -0.05703474 0.00337749 -16.887 < 0.0000000000000002 ***
## ER_tri -0.00183074 0.00025146 -7.280 0.000000000000367 ***
## ER_topoWet -0.12936781 0.00595308 -21.731 < 0.0000000000000002 ***
## lon -0.00822231 0.00055788 -14.738 < 0.0000000000000002 ***
## lat -0.05503729 0.00334892 -16.434 < 0.0000000000000002 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.4407 on 7543 degrees of freedom
## Multiple R-squared: 0.224, Adjusted R-squared: 0.2233
## F-statistic: 311 on 7 and 7543 DF, p-value: < 0.00000000000000022
y_predict <- predict(mdl, d, type="response")
y_true <- d$present
range(y_predict)
## [1] -0.1133006 1.2159379
range(y_true)
## [1] 0 1
# fit a generalized linear model with a binomial logit link function
mdl <- glm(present ~ ., family = binomial(link="logit"), data = d)
summary(mdl)
##
## Call:
## glm(formula = present ~ ., family = binomial(link = "logit"),
## data = d)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.7173 -0.9518 -0.3569 0.9778 2.2628
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 22.054337 1.048234 21.040 < 0.0000000000000002 ***
## WC_alt -0.001326 0.000113 -11.739 < 0.0000000000000002 ***
## WC_bio1 -0.200805 0.020279 -9.902 < 0.0000000000000002 ***
## WC_bio2 -0.296798 0.018386 -16.143 < 0.0000000000000002 ***
## ER_tri -0.010043 0.001334 -7.529 0.000000000000051 ***
## ER_topoWet -0.655803 0.032787 -20.002 < 0.0000000000000002 ***
## lon -0.043951 0.003068 -14.324 < 0.0000000000000002 ***
## lat -0.290039 0.018853 -15.384 < 0.0000000000000002 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 10467.9 on 7550 degrees of freedom
## Residual deviance: 8588.5 on 7543 degrees of freedom
## AIC: 8604.5
##
## Number of Fisher Scoring iterations: 4
y_predict <- predict(mdl, d, type="response")
range(y_predict)
## [1] 0.04340474 0.97714602
# show term plots
termplot(mdl, partial.resid = TRUE, se = TRUE, main = F, ylim="free")
librarian::shelf(mgcv)
# fit a generalized additive model with smooth predictors
mdl <- mgcv::gam(
formula = present ~ s(WC_alt) + s(WC_bio1) +
s(WC_bio2) + s(ER_tri) + s(ER_topoWet) + s(lon) + s(lat),
family = binomial, data = d)
summary(mdl)
##
## Family: binomial
## Link function: logit
##
## Formula:
## present ~ s(WC_alt) + s(WC_bio1) + s(WC_bio2) + s(ER_tri) + s(ER_topoWet) +
## s(lon) + s(lat)
##
## Parametric coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -0.45748 0.05898 -7.757 0.00000000000000871 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Approximate significance of smooth terms:
## edf Ref.df Chi.sq p-value
## s(WC_alt) 8.258 8.848 362.52 < 0.0000000000000002 ***
## s(WC_bio1) 8.525 8.841 499.93 < 0.0000000000000002 ***
## s(WC_bio2) 5.705 6.733 116.01 < 0.0000000000000002 ***
## s(ER_tri) 7.846 8.670 54.80 < 0.0000000000000002 ***
## s(ER_topoWet) 8.667 8.964 42.97 0.00000146 ***
## s(lon) 8.376 8.878 363.69 < 0.0000000000000002 ***
## s(lat) 8.358 8.888 248.82 < 0.0000000000000002 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## R-sq.(adj) = 0.501 Deviance explained = 43.2%
## UBRE = -0.19737 Scale est. = 1 n = 7551
# show term plots
plot(mdl, scale=0)
# load extra packages
librarian::shelf(
maptools, sf)
mdl_maxent_rds <- file.path(dir_data, "mdl_maxent.rds")
# show version of maxent
if (!interactive())
maxent()
## This is MaxEnt version 3.4.3
# get environmental rasters
# NOTE: the first part of Lab 1. SDM - Explore got updated to write this clipped environmental raster stack
env_stack_grd <- file.path(dir_data, "env_stack.grd")
env_stack <- stack(env_stack_grd)
plot(env_stack, nc=2)
# get presence-only observation points (maxent extracts raster values for you)
obs_geo <- file.path(dir_data, "obs.geojson")
obs_sp <- read_sf(obs_geo) %>%
sf::as_Spatial() # maxent prefers sp::SpatialPoints over newer sf::sf class
# fit a maximum entropy model
if (!file.exists(mdl_maxent_rds)){
mdl <- maxent(env_stack, obs_sp)
readr::write_rds(mdl, mdl_maxent_rds)
}
mdl <- read_rds(mdl_maxent_rds)
# plot variable contributions per predictor
plot(mdl)
# plot term plots
response(mdl)
# predict
y_predict <- predict(env_stack, mdl) #, ext=ext, progress='')
plot(y_predict, main='Maxent, raw prediction')
data(wrld_simpl, package="maptools")
plot(wrld_simpl, add=TRUE, border='dark grey')
# graphical theme
ggplot2::theme_set(ggplot2::theme_light())
# read data
pts_env <- read_csv(pts_env_csv)
d <- pts_env %>%
select(-ID) %>% # not used as a predictor x
mutate(
present = factor(present)) %>% # categorical response
na.omit() # drop rows with NA
skim(d)
| Name | d |
| Number of rows | 7551 |
| Number of columns | 8 |
| _______________________ | |
| Column type frequency: | |
| factor | 1 |
| numeric | 7 |
| ________________________ | |
| Group variables | None |
Variable type: factor
| skim_variable | n_missing | complete_rate | ordered | n_unique | top_counts |
|---|---|---|---|---|---|
| present | 0 | 1 | FALSE | 2 | 0: 3785, 1: 3766 |
Variable type: numeric
| skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
|---|---|---|---|---|---|---|---|---|---|---|
| WC_alt | 0 | 1 | 715.32 | 690.71 | -57.00 | 215.00 | 416.00 | 1081.00 | 3611.00 | ▇▂▂▁▁ |
| WC_bio1 | 0 | 1 | 4.28 | 5.30 | -12.30 | 1.10 | 4.90 | 7.50 | 23.40 | ▁▃▇▂▁ |
| WC_bio2 | 0 | 1 | 11.64 | 2.64 | 4.00 | 10.30 | 11.60 | 13.00 | 19.90 | ▁▃▇▃▁ |
| ER_tri | 0 | 1 | 42.67 | 45.97 | 0.00 | 7.22 | 22.32 | 69.34 | 274.59 | ▇▂▁▁▁ |
| ER_topoWet | 0 | 1 | 10.69 | 1.92 | 6.73 | 8.98 | 10.78 | 12.24 | 15.22 | ▅▇▇▇▂ |
| lon | 0 | 1 | -105.00 | 21.97 | -154.71 | -122.12 | -110.50 | -85.46 | -52.85 | ▁▇▅▅▂ |
| lat | 0 | 1 | 48.11 | 7.24 | 33.88 | 43.21 | 46.94 | 53.38 | 66.12 | ▃▇▆▃▂ |
# create training set with 80% of full data
d_split <- rsample::initial_split(d, prop = 0.8, strata = "present")
d_train <- rsample::training(d_split)
# show number of rows present is 0 vs 1
table(d$present)
##
## 0 1
## 3785 3766
table(d_train$present)
##
## 0 1
## 3028 3012
# run decision stump model
mdl <- rpart(
present ~ ., data = d_train,
control = list(
cp = 0, minbucket = 5, maxdepth = 1))
mdl
## n= 6040
##
## node), split, n, loss, yval, (yprob)
## * denotes terminal node
##
## 1) root 6040 3012 0 (0.5013245 0.4986755)
## 2) WC_bio1< 1.95 1766 393 0 (0.7774632 0.2225368) *
## 3) WC_bio1>=1.95 4274 1655 1 (0.3872251 0.6127749) *
# plot tree
par(mar = c(1, 1, 1, 1))
rpart.plot(mdl)
# decision tree with defaults
mdl <- rpart(present ~ ., data = d_train)
mdl
## n= 6040
##
## node), split, n, loss, yval, (yprob)
## * denotes terminal node
##
## 1) root 6040 3012 0 (0.50132450 0.49867550)
## 2) WC_bio1< 1.95 1766 393 0 (0.77746319 0.22253681)
## 4) lat>=47.23326 1628 293 0 (0.82002457 0.17997543)
## 8) lon>=-146.8239 1495 204 0 (0.86354515 0.13645485) *
## 9) lon< -146.8239 133 44 1 (0.33082707 0.66917293) *
## 5) lat< 47.23326 138 38 1 (0.27536232 0.72463768) *
## 3) WC_bio1>=1.95 4274 1655 1 (0.38722508 0.61277492)
## 6) lat< 42.14012 1114 344 0 (0.69120287 0.30879713)
## 12) ER_topoWet>=9.33 752 67 0 (0.91090426 0.08909574) *
## 13) ER_topoWet< 9.33 362 85 1 (0.23480663 0.76519337) *
## 7) lat>=42.14012 3160 885 1 (0.28006329 0.71993671)
## 14) WC_bio2>=12.85 608 282 0 (0.53618421 0.46381579)
## 28) ER_topoWet>=10.435 302 47 0 (0.84437086 0.15562914) *
## 29) ER_topoWet< 10.435 306 71 1 (0.23202614 0.76797386) *
## 15) WC_bio2< 12.85 2552 559 1 (0.21904389 0.78095611) *
rpart.plot(mdl)
# plot complexity parameter
plotcp(mdl)
# rpart cross validation results
mdl$cptable
## CP nsplit rel error xerror xstd
## 1 0.32005312 0 1.0000000 1.0405046 0.01289211
## 2 0.14143426 1 0.6799469 0.6809429 0.01221916
## 3 0.06374502 2 0.5385126 0.5577689 0.01156177
## 4 0.03452855 3 0.4747676 0.4810757 0.01101830
## 5 0.02058433 5 0.4057105 0.4262948 0.01055674
## 6 0.01494024 6 0.3851262 0.4000664 0.01031142
## 7 0.01000000 7 0.3701859 0.3844622 0.01015733
# caret cross validation results
mdl_caret <- train(
present ~ .,
data = d_train,
method = "rpart",
trControl = trainControl(method = "cv", number = 10),
tuneLength = 20)
ggplot(mdl_caret)
vip(mdl_caret, num_features = 40, bar = FALSE)
# Construct partial dependence plots
p1 <- partial(mdl_caret, pred.var = "lat") %>% autoplot()
p2 <- partial(mdl_caret, pred.var = "WC_bio2") %>% autoplot()
p3 <- partial(mdl_caret, pred.var = c("lat", "WC_bio2")) %>%
plotPartial(levelplot = FALSE, zlab = "yhat", drape = TRUE,
colorkey = TRUE, screen = list(z = -20, x = -60))
# Display plots side by side
gridExtra::grid.arrange(p1, p2, p3, ncol = 3)
# number of features
n_features <- length(setdiff(names(d_train), "present"))
# fit a default random forest model
mdl_rf <- ranger(present ~ ., data = d_train)
# get out of the box RMSE
(default_rmse <- sqrt(mdl_rf$prediction.error))
## [1] 0.3117458
# re-run model with impurity-based variable importance
mdl_impurity <- ranger(
present ~ ., data = d_train,
importance = "impurity")
# re-run model with permutation-based variable importance
mdl_permutation <- ranger(
present ~ ., data = d_train,
importance = "permutation")
p1 <- vip::vip(mdl_impurity, bar = FALSE)
p2 <- vip::vip(mdl_permutation, bar = FALSE)
gridExtra::grid.arrange(p1, p2, nrow = 1)
# read points of observation: presence (1) and absence (0)
pts <- read_sf(pts_geo)
# create training set with 80% of full data
pts_split <- rsample::initial_split(
pts, prop = 0.8, strata = "present")
pts_train <- rsample::training(pts_split)
pts_test <- rsample::testing(pts_split)
pts_train_p <- pts_train %>%
filter(present == 1) %>%
as_Spatial()
pts_train_a <- pts_train %>%
filter(present == 0) %>%
as_Spatial()
# show pairs plot before multicollinearity reduction with vifcor()
pairs(env_stack)
# calculate variance inflation factor per predictor, a metric of multicollinearity between variables
vif(env_stack)
## Variables VIF
## 1 WC_alt 3.724311
## 2 WC_bio1 1.697796
## 3 WC_bio2 3.465192
## 4 ER_tri 4.392701
## 5 ER_topoWet 4.091482
# stepwise reduce predictors, based on a max correlation of 0.7 (max 1)
v <- vifcor(env_stack, th=0.7)
v
## 1 variables from the 5 input variables have collinearity problem:
##
## ER_tri
##
## After excluding the collinear variables, the linear correlation coefficients ranges between:
## min correlation ( WC_bio1 ~ WC_alt ): 0.02400813
## max correlation ( ER_topoWet ~ WC_alt ): -0.558568
##
## ---------- VIFs of the remained variables --------
## Variables VIF
## 1 WC_alt 3.084384
## 2 WC_bio1 1.584131
## 3 WC_bio2 2.847255
## 4 ER_topoWet 2.058726
# reduce enviromental raster stack by
env_stack_v <- usdm::exclude(env_stack, v)
# show pairs plot after multicollinearity reduction with vifcor()
pairs(env_stack_v)
# fit a maximum entropy model
if (!file.exists(mdl_maxv_rds)){
mdl_maxv <- maxent(env_stack_v, sf::as_Spatial(pts_train))
readr::write_rds(mdl_maxv, mdl_maxv_rds)
}
mdl_maxv <- read_rds(mdl_maxv_rds)
# plot variable contributions per predictor
plot(mdl_maxv)
# plot term plots
response(mdl_maxv)
# predict
y_maxv <- predict(env_stack, mdl_maxv) #, ext=ext, progress='')
plot(y_maxv, main='Maxent, raw prediction')
data(wrld_simpl, package="maptools")
plot(wrld_simpl, add=TRUE, border='dark grey')
pts_test_p <- pts_test %>%
filter(present == 1) %>%
as_Spatial()
pts_test_a <- pts_test %>%
filter(present == 0) %>%
as_Spatial()
y_maxv <- predict(mdl_maxv, env_stack)
#plot(y_maxv)
e <- dismo::evaluate(
p = pts_test_p,
a = pts_test_a,
model = mdl_maxv,
x = env_stack)
e
## class : ModelEvaluation
## n presences : 756
## n absences : 758
## AUC : 0.8389856
## cor : 0.5861861
## max TPR+TNR at : 0.656592
plot(e, 'ROC')
thr <- threshold(e)[['spec_sens']]
thr
## [1] 0.656592
p_true <- na.omit(raster::extract(y_maxv, pts_test_p) >= thr)
a_true <- na.omit(raster::extract(y_maxv, pts_test_a) < thr)
# (t)rue/(f)alse (p)ositive/(n)egative rates
tpr <- sum(p_true)/length(p_true)
fnr <- sum(!p_true)/length(p_true)
fpr <- sum(!a_true)/length(a_true)
tnr <- sum(a_true)/length(a_true)
matrix(
c(tpr, fnr,
fpr, tnr),
nrow=2, dimnames = list(
c("present_obs", "absent_obs"),
c("present_pred", "absent_pred")))
## present_pred absent_pred
## present_obs 0.8148148 0.2493404
## absent_obs 0.1851852 0.7506596
# add point to ROC plot
plot(e, 'ROC')
points(fpr, tpr, pch=23, bg="blue")
plot(y_maxv > thr)